import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sbn
import plotly_express as px
import us
import plotly.offline as py
import plotly.graph_objects as go
import us
py.init_notebook_mode()
data = pd.read_csv("Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_raw_mon.csv")
data.head()
# extract city from region name
data.drop(0,axis=0,inplace=True)
data['city'] = data.RegionName.apply(lambda x: x.split(',')[0])
dates = data.columns.tolist()[5:-1]
def data_to_df(df,row,col,names):
df.set_index(row,inplace=True)
row = df.index.tolist()
d = []
for i in range(len(row)):
for j in range(len(col)):
d.append([row[i],col[j],df.at[row[i],col[j]]])
return pd.DataFrame(np.array(d),columns=names)
df = data_to_df(df = data, row='StateName', col=dates,names=['state','date','price'])
df.head()
df.price.fillna(0,inplace=True)
df.date = pd.to_datetime(df.date)
df.price = df.price.apply(lambda x: sum(np.nan_to_num(x))/len(np.nan_to_num(x)) if type(x)==np.ndarray else x)
# df['state'] = df.city.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else x)
df['year'] = df.date.dt.year
df.isna().sum()
gdp = pd.read_excel('gdp_by_state.xls',skiprows=5,skipfooter=3)
gdp.drop(0,axis=0,inplace=True)
year = gdp.columns[2:]
df_gdp = data_to_df(gdp,'GeoName',year,['state','year','gdp'])
df_gdp.year = df_gdp.year.apply(lambda x: int(x.split('-')[1]))
df_gdp.state = df_gdp.state.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else x)
df_gdp.head()
df = df.merge(df_gdp,left_on=['state','year'], right_on=['state','year'])
emp = pd.read_excel('by_state_data/Employment_by_state.xls',skiprows=5,skipfooter=5)
col = ['GeoFips','GeoName','1997','1998','1999','2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']
emp.drop(0,inplace=True)
emp = emp[col]
emp.head()
emp = data_to_df(df = emp, row='GeoName', col=col[2:],names=['state','year','emp_rate'])
emp.state = emp.state.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else x)
emp.year = emp.year.astype('int')
emp.emp_rate = emp.emp_rate.astype('float')
df = df.merge(emp,left_on=['state','year'], right_on=['state','year'])
interest = pd.read_csv("by_state_data/MORTGAGE30US_intrest_rate.csv")
interest.DATE = pd.to_datetime(interest.DATE)
interest.head()
interest = interest.groupby(interest.DATE.dt.year).mean().reset_index()
interest.columns = ['year','interest_rate']
df = df.merge(interest,left_on='year',right_on='year')
population = pd.read_excel('by_state_data/population_by_state.xls',skiprows=5,skipfooter=6)
population = population[col]
population.drop(0,axis=0,inplace=True)
population.head()
population = data_to_df(population,'GeoName',col[2:],['state', 'year', 'population_perc'])
population.year = population.year.astype('int')
population.population_perc = population.population_perc.astype('float')
population.state = population.state.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else x)
df = df.merge(population,left_on=['state','year'], right_on=['state','year'])
df.gdp = df.gdp.astype('float')
df['month'] = df.date.dt.month
hpi = pd.read_csv('House_price_by _state.csv')
col = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV',
'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA',
'VT', 'WA', 'WI', 'WV', 'WY']
hpi = data_to_df(hpi, 'Date', col,['yr','state','hpi'] )
hpi.yr = pd.to_datetime(hpi.yr)
hpi.hpi = pd.to_numeric(hpi.hpi)
hpi['year'] = hpi.yr.dt.year
hpi['month'] = hpi.yr.dt.month
hpi.drop(columns='yr',inplace=True)
hpi.head()
df = df.merge(hpi, left_on=['state','year','month'], right_on=['state','year','month'])
income = pd.read_excel('personel income.xls',skiprows=10)
income.observation_date = pd.to_datetime(income.observation_date)
income['year'] = income.observation_date.dt.year
income['month'] = income.observation_date.dt.month
income.drop('observation_date',axis=1,inplace=True)
income.head()
df = df.merge(income,left_on=['year','month'],right_on=['year','month'])
df.head()
states = df.groupby('state')['price'].max().reset_index().sort_values(by='price',ascending=False)['state'][:5].values.tolist()
tmp = df[df.state.isin(states)].groupby(['year','state'])['price'].mean().dropna().reset_index()
px.line(tmp,
x='year',
y='price',
color='state',
title='Top 5 expensive state\'s House price from year 1998 to 2020 by state')
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
tmp = df.groupby('state')['gdp'].mean().reset_index()
tmp = tmp.dropna(subset=['state','gdp'], how='any')
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['state'],
z = tmp['gdp'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "GDP")
) ]
layout = dict(
title = 'Average GDP in different states',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
px.line(df,
x='year',
y='gdp',
color='state',
title='State wise gdp from year 1998 to 2020')
px.line(df,
x='year',
y='interest_rate',
title='Interest rate from year 1998 to 2020')
px.line(df[['year','price']].groupby('year').mean().reset_index(),
x='year',
y='price',
title='Average house price from year 1998 to 2020')
House Price Index(HPI): A house price index measures the price changes of residential housing as a percentage change from some specific start date.
px.line(df.drop_duplicates(subset=['month','year']),
x='date',
y='hpi',
title='House price index from year 1998 to 2020')
px.line(df,
x='year',
y='population_perc',
color='state',
title='Population percentage from year 1998 to 2020')
px.line(df,
x='year',
y='emp_rate',
color='state',
title='Employment percentage of total employment from year 1998 to 2020 by state',
labels={'emp_rate':'Employment percentage','year':'Year'})
px.line(df,
x='year',
y='PI',
title='US personal income in dollars from year 1998 to 2020',
labels={'year':'Year','PI':'Personal income in dollars'})
df.describe()
corr = df.corr()
plt.figure(figsize=(10,10))
sbn.heatmap(corr,annot=True,cmap='coolwarm')
plt.title('Feature correlation')
plt.show()
From above heatmap we can conclude
tmp = df[['state','price']].groupby('state').mean().reset_index()
tmp = tmp.sort_values(by='price',ascending=False)
# plt.figure(figsize=(15,8))
# px.bar(tmp,x='state',y ='price',color='state',title='Average House price by state')
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['state'],
z = tmp['price'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "GDP")
) ]
layout = dict(
title = 'Average House price by state',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
state_data = pd.read_csv('by_state_data/State_time_series.csv')
state_data.drop(columns=['DaysOnZillow_AllHomes',
'InventorySeasonallyAdjusted_AllHomes',
'InventoryRaw_AllHomes', 'MedianPriceCutDollar_AllHomes',
'MedianPriceCutDollar_CondoCoop',
'MedianPriceCutDollar_SingleFamilyResidence','PctOfListingsWithPriceReductionsSeasAdj_AllHomes',
'PctOfListingsWithPriceReductionsSeasAdj_CondoCoop',
'PctOfListingsWithPriceReductionsSeasAdj_SingleFamilyResidence','Sale_Counts_Seas_Adj'],inplace=True)
state_data.Date = pd.to_datetime(state_data.Date)
state_data['year'] = state_data.Date.dt.year
Zillow Home Value Index (ZHVI): A smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range
var = ['ZHVI_1bedroom',
'ZHVI_2bedroom',
'ZHVI_3bedroom',
'ZHVI_4bedroom',
'ZHVI_5BedroomOrMore',]
tmp = state_data.groupby('year')[var].mean().reset_index()
fig = go.Figure()
for i in var:
fig.add_trace(go.Bar(x =tmp['year'], y=tmp[i], name=i.split('ZHVI_')[1]))
fig.update_layout(title='House value index for different year by no of rooms', xaxis_title='year',yaxis_title='ZHIV value')
fig.show()
tmp = state_data.groupby('RegionName')['ZHVIPerSqft_AllHomes'].mean().reset_index(name = "ZHVIpersqft_mean")
tmp.RegionName = tmp.RegionName.apply(lambda x: us.states.lookup(x).abbr if us.states.lookup(x) is not None else None )
tmp = tmp.dropna(subset=['ZHVIpersqft_mean','RegionName'], how='any')
scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = tmp['RegionName'],
z = tmp['ZHVIpersqft_mean'].astype(float),
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 2
) ),
colorbar = dict(
title = "Home value per square foot")
) ]
layout = dict(
title = 'Median house value per square foot in different states',
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = dict( data=data, layout=layout )
py.iplot(fig)
px.bar(state_data.groupby('year')['ZHVIPerSqft_AllHomes'].mean().reset_index(),
x='year',
y='ZHVIPerSqft_AllHomes',
title='Median house value per square foot in different year',
labels={'ZHVIPerSqft_AllHomes':'House value index per sqft'})
fig = go.Figure()
var = ['MedianListingPricePerSqft_1Bedroom',
'MedianListingPricePerSqft_2Bedroom',
'MedianListingPricePerSqft_3Bedroom',
'MedianListingPricePerSqft_4Bedroom',
'MedianListingPricePerSqft_5BedroomOrMore',
'MedianListingPricePerSqft_DuplexTriplex',
'MedianListingPricePerSqft_SingleFamilyResidence']
tmp = state_data.groupby('year')[var].mean().dropna().reset_index()
for i in var:
fig.add_trace(go.Scatter(x=tmp['year'], y=tmp[i],
mode='lines',
name=i.split('MedianListingPricePerSqft_')[1]))
fig.update_layout(title ='Median house price per sqft by number of bedrooms', xaxis_title='year',yaxis_title='Price per sqft')
fig.show()
fig = go.Figure()
var = ['PctOfHomesIncreasingInValues_AllHomes','PctOfHomesDecreasingInValues_AllHomes']
tmp = state_data.groupby('year')[var].median().dropna().reset_index()
for i in var:
fig.add_trace(go.Scatter(x=tmp['year'], y=tmp[i],
mode='lines',
name=i))
fig.update_layout(title ='Percentage of house price increasing and decreasing in values', xaxis_title='year',yaxis_title='Percentage of house price')
fig.show()
Both the line alters between year 2008 and 2011
fig = go.Figure()
var = ['MedianRentalPrice_1Bedroom',
'MedianRentalPrice_2Bedroom',
'MedianRentalPrice_3Bedroom',
'MedianRentalPrice_4Bedroom',
'MedianRentalPrice_5BedroomOrMore',
'MedianRentalPrice_CondoCoop',
'MedianRentalPrice_DuplexTriplex',
'MedianRentalPrice_MultiFamilyResidence5PlusUnits',
'MedianRentalPrice_SingleFamilyResidence']
tmp = state_data.groupby('year')[var].mean().dropna().reset_index()
for i in var:
fig.add_trace(go.Scatter(x=tmp['year'], y=tmp[i],
mode='lines',
name=i.split('MedianRentalPrice_')[1]))
fig.update_layout(title ='Median rental price per sqft', xaxis_title='year',yaxis_title='Price per sqft')
fig.show()
List of variables inpacting the house prices will be
Some other variable which impacts the house price will be